library(haven)
library(MuMIn)
library(arrow)
## 
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
## 
##     timestamp
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::duration() masks arrow::duration()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::lag()          masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(nlme)
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:lme4':
## 
##     lmList
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(ggeffects)
library(rlang)
## 
## Attaching package: 'rlang'
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
## 
## The following object is masked from 'package:arrow':
## 
##     string
library(tableone)
library(r2mlm)
## 
## Please cite as: 
## 
##  Shaw, M., Rights, J.D., Sterba, S.S., Flake, J.K. (2023). r2mlm: An R package calculating R-squared measures for multilevel models. Behavior Research Methods, 55(4), 1942-1964. doi:10.3758/s13428-022-01841-4
library(ggpubr)

# Read environment variables from a .env file
readRenviron(".env")

dat_dir <- Sys.getenv("DAT_DIR") # Data directory
man_img_dir <- Sys.getenv("MAN_IMG_DIR") # Manuscript image directory

Demographics

dem <- readRDS(file.path(dat_dir, "CLEAR3_Biaffect_Leow", "clear3trait.rds")) %>%
  as_factor() %>% # Convert haven_labelled variables to factors
  mutate(
    id = as.character(id),
    race_clear123 = as_factor(race_clear123),
    ethnicity = as_factor(ethnicity)
  ) %>%
  arrange(id) %>%
  mutate(across(where(is.character), ~ if_else(.x == "", "Unknown", .x))) %>%
  rename(Age = age, Gender = gender, `House income` = houseincomecat)

# Given to me by Anisha
dem[dem$id == "3085",]$Age <- 24
dem[dem$id == "3100",]$Age <- 23

colnames(dem)
##  [1] "id"                         "Age"                       
##  [3] "race_clear123"              "ethnicity"                 
##  [5] "bmi"                        "filing_status"             
##  [7] "sex_orient"                 "Gender"                    
##  [9] "House income"               "currentdx_bipolar1"        
## [11] "currentdx_bipolar2"         "currentdx_cyclothymic"     
## [13] "currentdx_othbipolar"       "currentdx_bipAMC"          
## [15] "currentdx_sindbip"          "currentdx_MDD"             
## [17] "currentdx_PDD"              "currentdx_othDD"           
## [19] "currentdx_DDAMC"            "currentdx_SiDD"            
## [21] "currentdx_Schizophrenia"    "currentdx_schizophreniform"
## [23] "currentdx_schizoaffective"  "currentdx_delusionaldo"    
## [25] "currentdx_briefpsychoticdo" "currentdx_psychoticAMC"    
## [27] "currentdx_sipsychoticdo"    "currentdx_otherpsychotic"  
## [29] "currentdx_SUDalc"           "currentdx_SUDsha"          
## [31] "currentdx_SUDmj"            "currentdx_SUDstim"         
## [33] "currentdx_SUDopioid"        "currentdx_SUDpcp"          
## [35] "currentdx_SUDothhall"       "currentdx_SUDinhal"        
## [37] "currentdx_SUDother"         "currentdx_PanicDO"         
## [39] "currentdx_Agoraphobia"      "currentdx_SAD"             
## [41] "currentdx_Phobi"            "currentdx_GAD"             
## [43] "currentdx_OtherAnx"         "currentdx_AnxAMC"          
## [45] "currentdx_siANX"            "currentdx_OCD"             
## [47] "currentdx_otherOC"          "currentdx_OCDAMC"          
## [49] "currentdx_siOCD"            "currentdx_AN"              
## [51] "currentdx_BN"               "currentdx_BED"             
## [53] "currentdx_otheat"           "currentdx_ADHD"            
## [55] "currentdx_AcuteStressDO"    "currentdx_PTSD"            
## [57] "currentdx_AdjustmentDO"     "currentdx_Othertrauma"     
## [59] "currentdx_PMDD_prov"        "currentdx_BPD"

Oura ring

dat_oura <- readRDS(
  file.path(dat_dir, "CLEAR3_Biaffect_Leow", "clear3baseline_sleep.rds")) %>%
  mutate(
    Oura_Sleep_hour = Oura_Sleep_TST / 3600, # Second to hour
    sleepdur_yest = sleepdur_yest + 1 # Likert to hours
  ) %>%
  filter(id != 3086 | daterated != ymd("2022-07-31")) # Duplicate entries

colnames(dat_oura)
##  [1] "id"              "daterated"       "cleartrialphase" "sleepdur_yest"  
##  [5] "SleepLNQuality"  "Oura_Sleep_TST"  "sedative_YN"     "stim_YN"        
##  [9] "SSRI"            "SNRI"            "bzd_YN"          "Oura_Sleep_hour"

Convert to parquet to make accessible to Python.

dat_oura_raw <- readRDS(
  file.path(dat_dir, "CLEAR3_Biaffect_Leow", "clear3baseline_sleep.rds"))

write_parquet(dat_oura_raw, file.path(dat_dir, 
                                      "CLEAR3_Biaffect_Leow", 
                                      "clear3baseline_sleep.parquet"))
ggplot(dat_oura, aes(sleepdur_yest, Oura_Sleep_hour)) +
  geom_jitter() +
  geom_abline(intercept = 0, slope = 1) +
  labs(title = "Recorded vs reported sleep time across participants (jittered)") +
  xlab("Self-reported sleep duration (hour)") +
  ylab("Oura-recorded total sleep time (hour)") +
  theme(text = element_text(size = 18))
## Warning: Removed 8760 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(dat_oura, aes(SleepLNQuality, Oura_Sleep_hour)) +
  geom_point() +
  labs(title = "Recorded sleep time vs reported sleep quality across participants") +
  xlab("Self-reported sleep quality (higher is better)") +
  ylab("Oura-recorded total sleep time (hour)") +
  theme(text = element_text(size = 18))
## Warning: Removed 8763 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(dat_oura, aes(sleepdur_yest, SleepLNQuality)) +
  geom_jitter() +
  labs(title = "Reported sleep time vs quality across participants") +
  xlab("Self-reported sleep duration (hour)") +
  ylab("Self-reported sleep quality (higher is better)") +
  theme(text = element_text(size = 18))
## Warning: Removed 2682 rows containing missing values or values outside the scale range
## (`geom_point()`).

Participants with self-report data:

tmp <- dat_oura %>%
  drop_na(sleepdur_yest)

n_distinct(tmp$id)
## [1] 150

Participants with Oura data:

tmp <- dat_oura %>%
  drop_na(Oura_Sleep_TST)

n_distinct(tmp$id)
## [1] 60
cor.test(dat_oura$sleepdur_yest, dat_oura$Oura_Sleep_hour, method = "spearman")
## Warning in cor.test.default(dat_oura$sleepdur_yest, dat_oura$Oura_Sleep_hour, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  dat_oura$sleepdur_yest and dat_oura$Oura_Sleep_hour
## S = 289552223, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.739788

Reading data

# The values in sleepdur_yest are not the raw number of hours slept, but
# rather an index to the response options:
# 1 - 2 or fewer hours
# 2 - 3 hours
# 3 - 4 hours
# ...
# 9 - 10 or more hours
dat <- read_parquet(file.path(dat_dir, "spleep_pred_10_90_alpha1_2024-11-14.parquet")) %>%
  mutate(
    Oura_Sleep_hour = Oura_Sleep_TST / 3600, # Second to hour
    sleepdur_yest = sleepdur_yest + 1
  ) %>%
  filter(subject != "3086" | date != ymd("2022-07-31")) # Duplicate entries

dat_dem <- dat %>%
  left_join(dem, by = c("subject" = "id"))

dat_cmplt_sr <- dat_dem %>%
  drop_na(hour_estimate, sleepdur_yest)

dat_cmplt_oura <- dat_dem %>%
  drop_na(c(hour_estimate, Oura_Sleep_hour))

colnames(dat)
##  [1] "subject"         "date"            "cleartrialphase" "sleepdur_yest"  
##  [5] "SleepLNQuality"  "Oura_Sleep_TST"  "sedative_YN"     "stim_YN"        
##  [9] "SSRI"            "SNRI"            "bzd_YN"          "baseline"       
## [13] "dayNumber"       "activity"        "label"           "hour_estimate"  
## [17] "n_total_presses" "n_active_hours"  "Oura_Sleep_hour"

Exclude the first few days (amounts to a total of 115) to prevent leakage of information from the test set via temporal correlation. 44 participants.

# We can only reliably find the last training days in the full data set
last_train_days <- dat %>%
  drop_na(dayNumber) %>%
  filter(label == "train") %>%
  group_by(subject) %>%
  summarize(last_train_day = tail(dayNumber, n = 1))

# Helper function
filter_mutate <- function(.x) {
  .x %>%
    left_join(last_train_days, by = "subject") %>%
    filter(label == "test", dayNumber > last_train_day + 3) %>% # Don't include first three days in test region
    mutate(
      activity_c = scale(activity),
      hour_estimate_c = drop(scale(hour_estimate)),
      age_c = scale(Age)
    ) %>%
    group_by(subject) %>%
    mutate(
      hour_estimate_cluster_centered = drop(scale(hour_estimate, scale = FALSE)),
      hour_estimate_cluster_mean = mean(hour_estimate, na.rm = TRUE)
    ) %>%
    ungroup()
}

dat_test_sr <- dat_cmplt_sr %>%
  filter_mutate()

dat_test_oura <- dat_cmplt_oura %>%
  filter_mutate()

colnames(dat_test_oura)
##  [1] "subject"                        "date"                          
##  [3] "cleartrialphase"                "sleepdur_yest"                 
##  [5] "SleepLNQuality"                 "Oura_Sleep_TST"                
##  [7] "sedative_YN"                    "stim_YN"                       
##  [9] "SSRI"                           "SNRI"                          
## [11] "bzd_YN"                         "baseline"                      
## [13] "dayNumber"                      "activity"                      
## [15] "label"                          "hour_estimate"                 
## [17] "n_total_presses"                "n_active_hours"                
## [19] "Oura_Sleep_hour"                "Age"                           
## [21] "race_clear123"                  "ethnicity"                     
## [23] "bmi"                            "filing_status"                 
## [25] "sex_orient"                     "Gender"                        
## [27] "House income"                   "currentdx_bipolar1"            
## [29] "currentdx_bipolar2"             "currentdx_cyclothymic"         
## [31] "currentdx_othbipolar"           "currentdx_bipAMC"              
## [33] "currentdx_sindbip"              "currentdx_MDD"                 
## [35] "currentdx_PDD"                  "currentdx_othDD"               
## [37] "currentdx_DDAMC"                "currentdx_SiDD"                
## [39] "currentdx_Schizophrenia"        "currentdx_schizophreniform"    
## [41] "currentdx_schizoaffective"      "currentdx_delusionaldo"        
## [43] "currentdx_briefpsychoticdo"     "currentdx_psychoticAMC"        
## [45] "currentdx_sipsychoticdo"        "currentdx_otherpsychotic"      
## [47] "currentdx_SUDalc"               "currentdx_SUDsha"              
## [49] "currentdx_SUDmj"                "currentdx_SUDstim"             
## [51] "currentdx_SUDopioid"            "currentdx_SUDpcp"              
## [53] "currentdx_SUDothhall"           "currentdx_SUDinhal"            
## [55] "currentdx_SUDother"             "currentdx_PanicDO"             
## [57] "currentdx_Agoraphobia"          "currentdx_SAD"                 
## [59] "currentdx_Phobi"                "currentdx_GAD"                 
## [61] "currentdx_OtherAnx"             "currentdx_AnxAMC"              
## [63] "currentdx_siANX"                "currentdx_OCD"                 
## [65] "currentdx_otherOC"              "currentdx_OCDAMC"              
## [67] "currentdx_siOCD"                "currentdx_AN"                  
## [69] "currentdx_BN"                   "currentdx_BED"                 
## [71] "currentdx_otheat"               "currentdx_ADHD"                
## [73] "currentdx_AcuteStressDO"        "currentdx_PTSD"                
## [75] "currentdx_AdjustmentDO"         "currentdx_Othertrauma"         
## [77] "currentdx_PMDD_prov"            "currentdx_BPD"                 
## [79] "last_train_day"                 "activity_c"                    
## [81] "hour_estimate_c"                "age_c"                         
## [83] "hour_estimate_cluster_centered" "hour_estimate_cluster_mean"

Descriptive statistics

ggplot(dat_oura %>% drop_na(Oura_Sleep_TST), aes(id)) +
  geom_bar() +
  labs(x = "Participant ID", y = "Count") +
  theme(
    text = element_text(size = 18),
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  )

ggplot(dat_test_oura, aes(subject)) +
  geom_bar() +
  labs(x = "Participant ID", y = "Count") +
  theme(
    text = element_text(size = 18),
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  )

Self-report.

ggplot(dat_test_sr, aes(sleepdur_yest)) +
  geom_histogram() +
  labs(x = "Self-reported sleep duration")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dat_test_sr, aes(SleepLNQuality)) +
  geom_histogram() +
  labs(x = "Self-reported sleep quality")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dat_test_sr, aes(activity)) +
  geom_histogram() +
  labs(x = "Activity score")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dat_test_sr, aes(hour_estimate)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dat_test_sr, aes(activity_c, sleepdur_yest)) +
  geom_jitter()

ggplot(dat_test_sr, aes(hour_estimate, sleepdur_yest)) +
  geom_jitter() +
  geom_abline(intercept = 0, slope = 1) +
  labs(title = "Reported vs estimated sleep (jittered)", 
       x = "Hour estimate", y = "Self-reported sleep duration") +
  theme(text = element_text(size=16))

ggplot(dat_test_sr, aes(hour_estimate, SleepLNQuality)) +
  geom_jitter(alpha = 0.2) +
  labs(title = "Reported quality vs estimated duration (jittered)", 
       x = "Hour estimate", y = "Self-reported sleep quality") +
  theme(text = element_text(size=16))

Oura ring:

n_presses_breaks = c(400, 1100, 3000, 8100, 22000)
ggplot(dat_test_oura, aes(hour_estimate, Oura_Sleep_hour)) +
  geom_jitter(aes(color = n_total_presses)) +
  geom_abline(intercept = 0, slope = 1) +
  scale_color_gradient(name = "Number of key presses", trans = "log",
                       breaks = n_presses_breaks, labels = n_presses_breaks) +
  labs(title = "Recorded vs estimated sleep (jittered)", 
       x = "Hour estimate", y = "Oura sleep duration") +
  theme(text = element_text(size=16))

ggplot(dat_test_oura, aes(hour_estimate, Oura_Sleep_hour)) +
  geom_jitter(aes(color = n_active_hours)) +
  geom_abline(intercept = 0, slope = 1) +
  labs(title = "Recorded vs estimated sleep (jittered)", 
       x = "Hour estimate", y = "Oura sleep duration") +
  theme(text = element_text(size=16))

Self-report and Oura ring:

# 1887 observations
tmp <- dat_oura %>%
  drop_na(sleepdur_yest, Oura_Sleep_hour)

cor.test(tmp$sleepdur_yest, tmp$Oura_Sleep_hour, method = "spearman")
## Warning in cor.test.default(tmp$sleepdur_yest, tmp$Oura_Sleep_hour, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  tmp$sleepdur_yest and tmp$Oura_Sleep_hour
## S = 289552223, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.739788

Overall:

# Include all Oura and self-report data points, filter BiAffect data points
# based on train/test label
dat_descr <- dat %>%
  left_join(last_train_days, by = "subject") %>%
  select(subject, date, sleepdur_yest, Oura_Sleep_hour, hour_estimate, 
         label, dayNumber, last_train_day) %>%
  pivot_longer(sleepdur_yest:hour_estimate, names_to = "variable") %>%
  filter(variable != "hour_estimate" | (label == "test" & dayNumber > last_train_day + 3))

max_val = round(max(dat_descr$value, na.rm = TRUE))

g_violin <- ggplot(dat_descr, aes(variable, value, fill = variable)) +
  geom_violin() +
  geom_jitter(width = 0.05, height = 0, alpha = 0.1) +
  scale_x_discrete(labels = c("BiAffect", "Oura ring", "Self-report")) +
  scale_y_continuous(minor_breaks = seq(0, max_val)) +
  scale_fill_manual(values = c("lightcoral", "aquamarine3", "skyblue4")) +
  labs(x = NULL, y = "Sleep duration (hour)") +
  # facet_wrap(~ variable, scales = "free_y") +
  theme_minimal() +
  theme(
    text = element_text(size = 16),
    legend.position="none",
    axis.text.x = element_text(size = 16)
  )

g_violin
## Warning: Removed 24691 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 24691 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(file.path(man_img_dir, "violin_sleep.pdf"))
## Saving 7 x 5 in image
## Warning: Removed 24691 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 24691 rows containing missing values or values outside the scale range
## (`geom_point()`).
dat_descr %>%
  drop_na(value) %>%
  group_by(variable) %>%
  summarize(n())
max_lim <- 19
r_coord = 15

tmp <- dat_descr %>%
  pivot_wider(id_cols = c(subject, date), 
              names_from = variable, values_from = value)

gs <- vector("list", length = 3)

gs[[1]] <- ggplot(tmp, aes(hour_estimate, Oura_Sleep_hour)) +
  labs(x = "BiAffect sleep duration (hour)", 
       y = "Oura sleep duration (hour)") +
  geom_label(aes(x, y), label = expression(r[s] == 0.40), parse = TRUE, 
             data = data.frame(x = r_coord, y = r_coord)) # Prevent overdrawing

gs[[2]] <- ggplot(tmp, aes(hour_estimate, sleepdur_yest)) +
  labs(x = "BiAffect sleep duration (hour)", 
       y = "Self-reported sleep duration (hour)") +
  geom_label(aes(x, y), label = expression(r[s] == 0.26), parse = TRUE, 
             data = data.frame(x = r_coord, y = r_coord)) # Prevent overdrawing

gs[[3]] <- ggplot(tmp, aes(sleepdur_yest, Oura_Sleep_hour))+
  labs(x = "Self-reported sleep duration (hour)", 
       y = "Oura sleep duration (hour)") +
  geom_label(aes(x, y), label = expression(r[s] == 0.74), parse = TRUE, 
             data = data.frame(x = r_coord, y = r_coord)) # Prevent overdrawing
  
gs <- lapply(gs, function(g) {
  g +
    geom_jitter(width = 0.2, height = 0.2, alpha = 0.5) +
    xlim(0, max_lim) +
    ylim(0, max_lim) +
    coord_fixed() +
    theme_minimal() +
    theme(text = element_text(size = 16))
})

g_scatter <- ggarrange(plotlist = gs, nrow = 1)
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
## Warning: Removed 16518 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
## Warning: Removed 15614 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
## Warning: Removed 15587 rows containing missing values or values outside the scale range
## (`geom_point()`).
g_scatter

ggarrange(g_violin, g_scatter, labels = c("A", "B"), font.label = list(size = 20), nrow = 2)
## Warning: Removed 24691 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 24691 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(file.path(man_img_dir, "violin_scatter.pdf"))
## Saving 7 x 8 in image

More than enough evidence for random slopes and intercepts:

lmList <- nlme::lmList

m_list <- lmList(Oura_Sleep_hour ~ hour_estimate_c | subject, data = dat_test_oura, na.action = na.omit)

plot(intervals(m_list))

Activity score

No correlation between random slope and intercept.

m1 <- lmer(sleepdur_yest ~ activity_c + (activity_c || subject), dat_test_sr)

summary(m1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: sleepdur_yest ~ activity_c + ((1 | subject) + (0 + activity_c |  
##     subject))
##    Data: dat_test_sr
## 
## REML criterion at convergence: 6279.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8534 -0.5346  0.0367  0.5671  3.4142 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  subject   (Intercept) 0.68807  0.8295  
##  subject.1 activity_c  0.01884  0.1372  
##  Residual              1.56250  1.2500  
## Number of obs: 1857, groups:  subject, 70
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  6.78254    0.10578   64.12
## activity_c  -0.39390    0.04751   -8.29
## 
## Correlation of Fixed Effects:
##            (Intr)
## activity_c 0.004

There is structure in the residuals. This is because the response variable is discrete. Maybe ordinal regression makes more sense here.

plot(m1)

Residuals also show some left-skew.

qqnorm(resid(m1))
qqline(resid(m1))

hist(resid(m1))

Random effects look okay except for that outlier in the bottom left.

re <- ranef(m1)$subject %>%
  rownames_to_column("subject") %>%
  pivot_longer(!subject)

ggplot(re, aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~ name, scales = "free")

plot(predict_response(m1, terms = "activity_c"), show_data = TRUE) +
  labs(title = "Predicted values of sleep duration", 
       x = "Activity (centred)", y = "Sleep duration") +
  theme(text = element_text(size = 16))
## Data points may overlap. Use the `jitter` argument to add some amount of
##   random variation to the location of data points and avoid overplotting.

Hour estimate

cor.test(dat_test_sr$hour_estimate, dat_test_sr$sleepdur_yest, method = "spearman")
## Warning in cor.test.default(dat_test_sr$hour_estimate,
## dat_test_sr$sleepdur_yest, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  dat_test_sr$hour_estimate and dat_test_sr$sleepdur_yest
## S = 795121818, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2550119
m2_sr <- lmer(sleepdur_yest ~ hour_estimate_c + (hour_estimate_c | subject), 
              dat_test_sr)

summary(m2_sr)
## Linear mixed model fit by REML ['lmerMod']
## Formula: sleepdur_yest ~ hour_estimate_c + (hour_estimate_c | subject)
##    Data: dat_test_sr
## 
## REML criterion at convergence: 6185.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7328 -0.5472  0.0519  0.5475  3.3838 
## 
## Random effects:
##  Groups   Name            Variance Std.Dev. Corr
##  subject  (Intercept)     0.6110   0.7817       
##           hour_estimate_c 0.1228   0.3504   0.12
##  Residual                 1.4529   1.2054       
## Number of obs: 1857, groups:  subject, 70
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)      6.86173    0.10174  67.445
## hour_estimate_c  0.46840    0.05919   7.914
## 
## Correlation of Fixed Effects:
##             (Intr)
## hour_stmt_c 0.114

To get the p values.

m2_nlme <- lme(sleepdur_yest ~ hour_estimate_c, 
               data = dat_test_sr, 
               random = ~ hour_estimate_c | subject)

summary(m2_nlme)
## Linear mixed-effects model fit by REML
##   Data: dat_test_sr 
##       AIC      BIC    logLik
##   6197.27 6230.424 -3092.635
## 
## Random effects:
##  Formula: ~hour_estimate_c | subject
##  Structure: General positive-definite, Log-Cholesky parametrization
##                 StdDev    Corr  
## (Intercept)     0.7816851 (Intr)
## hour_estimate_c 0.3503600 0.116 
## Residual        1.2053529       
## 
## Fixed effects:  sleepdur_yest ~ hour_estimate_c 
##                    Value  Std.Error   DF  t-value p-value
## (Intercept)     6.861725 0.10173705 1786 67.44569       0
## hour_estimate_c 0.468400 0.05918546 1786  7.91411       0
##  Correlation: 
##                 (Intr)
## hour_estimate_c 0.114 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.73275027 -0.54720891  0.05188497  0.54745068  3.38377022 
## 
## Number of Observations: 1857
## Number of Groups: 70
plot(m2_sr)

Residuals also show some left-skew.

qqnorm(resid(m2_sr))
qqline(resid(m2_sr))

hist(resid(m2_sr))

Random effects look okay, although the tails can get a little wonky.

re <- ranef(m2_sr)$subject %>%
  rownames_to_column("subject") %>%
  pivot_longer(!subject)

ggplot(re, aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~ name, scales = "free")

Prediction

plot(predict_response(m2_sr, terms = "hour_estimate_c"), show_data = TRUE, jitter = 0.1) +
  labs(title = "Predicted values of sleep duration", 
       x = "Hour estimate (centred)", y = "Sleep duration") +
  theme(text = element_text(size = 16))

hour_mean <- dat_test_sr$hour_estimate_c %@% "scaled:center"
hour_sd <- dat_test_sr$hour_estimate_c %@% "scaled:scale"

pr <- predict_response(m2_sr, terms = "hour_estimate_c") %>%
  as.data.frame() %>%
  mutate(x = hour_sd * x + hour_mean)

n_presses_breaks = c(400, 1100, 3000, 8100, 22000)

max_est = round(max(dat_test_sr$hour_estimate))

g_pred_sr <- ggplot(pr, aes(x, predicted)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
  geom_line(color = "lightcoral", lwd = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), 
              fill="lightcoral", alpha = 0.2) +
  geom_jitter(
    aes(hour_estimate, sleepdur_yest, color = n_total_presses), 
    width = 0.2, height = 0.1, alpha = 0.5,
    data = dat_cmplt_sr) +
  scale_color_gradient(name = "Number of key presses", transform = "log",
                       breaks = n_presses_breaks, labels = n_presses_breaks, 
                       high = "aquamarine3") +
  scale_x_continuous(minor_breaks = seq(
    0, max_est
  )) +
  scale_y_continuous(minor_breaks = seq(0, 15), limits = c(0, 15)) + 
  labs(x = "BiAffect-estimated sleep duration (hour)", 
       y = "Self-reported sleep duration (hour)") +
  coord_fixed() +
  theme_minimal() +
  theme(
    text = element_text(size = 20),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  )

g_pred_sr

# ggsave(file.path(man_img_dir, "duration_prediction.pdf"))
r_squared <- r2mlm(m2_sr)

# noquote class
vars <- r_squared$Decompositions[,1]
vars_df_sr <- data.frame(variance = unclass(vars)) %>%
  rownames_to_column("component") %>%
  filter(component != "fixed, between") %>%
  mutate(
    component = case_when(
      component == "fixed, within" ~ "Fixed effects",
      component == "slope variation" ~ "Random slope",
      component == "mean variation" ~ "Random intercept",
      component == "sigma2" ~ "Residuals"
    )
  )

vars_df_sr
cols <- rev(c("skyblue2", "skyblue3", "skyblue4", "grey25"))

g_var_sr <- ggplot(vars_df_sr, aes(1, variance)) +
  geom_col(aes(fill = fct_rev(component))) +
  scale_fill_manual(values = cols, name = "Variance component") +
  labs(x = NULL, y = "Proportion of variance") +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(position = "right") +
  theme_minimal() +
  theme(
    text = element_text(size = 20),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  )

g_var_sr

g_legends <- ggarrange(get_legend(g_pred_sr), get_legend(g_var_sr), ncol = 1, align = "v") 
g_plots <- ggarrange(g_pred_sr, g_var_sr, widths = c(7, 1), legend = "none")

ggarrange(g_plots, g_legends, widths = c(4, 1))

# ggsave(file.path(man_img_dir, "duration_prediction_sr_var.pdf"))

Predicting Oura ring

cortest <- cor.test(dat_test_oura$hour_estimate, dat_test_oura$Oura_Sleep_hour,
                    method = "spearman")
## Warning in cor.test.default(dat_test_oura$hour_estimate,
## dat_test_oura$Oura_Sleep_hour, : Cannot compute exact p-value with ties
cortest
## 
##  Spearman's rank correlation rho
## 
## data:  dat_test_oura$hour_estimate and dat_test_oura$Oura_Sleep_hour
## S = 86428092, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4008605
tmp <- dat_test_oura %>%
  mutate(
    sedative_YN = as.integer(sedative_YN),
    stim_YN = as.integer(stim_YN),
    SSRI = as.integer(SSRI),
    SNRI = as.integer(SNRI),
    age_c = drop(age_c)
  )

m2_oura <- lmer(
  # Oura_Sleep_hour ~ hour_estimate_c + age_c + sedative_YN + stim_YN + SSRI + SNRI + (hour_estimate_c | subject),
  Oura_Sleep_hour ~ hour_estimate_c + (hour_estimate_c | subject),
  tmp)

summary(m2_oura)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Oura_Sleep_hour ~ hour_estimate_c + (hour_estimate_c | subject)
##    Data: tmp
## 
## REML criterion at convergence: 3251.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4676 -0.5692  0.0217  0.5624  3.4339 
## 
## Random effects:
##  Groups   Name            Variance Std.Dev. Corr
##  subject  (Intercept)     0.3992   0.6318       
##           hour_estimate_c 0.2365   0.4863   0.27
##  Residual                 1.5735   1.2544       
## Number of obs: 953, groups:  subject, 40
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)      7.09929    0.11635  61.017
## hour_estimate_c  0.69065    0.09868   6.999
## 
## Correlation of Fixed Effects:
##             (Intr)
## hour_stmt_c 0.239
m2_oura_nlme <- lme(Oura_Sleep_hour ~ hour_estimate_c, 
               data = dat_test_oura, 
               random = ~ hour_estimate_c | subject)

summary(m2_oura_nlme)
## Linear mixed-effects model fit by REML
##   Data: dat_test_oura 
##        AIC      BIC    logLik
##   3263.511 3292.657 -1625.756
## 
## Random effects:
##  Formula: ~hour_estimate_c | subject
##  Structure: General positive-definite, Log-Cholesky parametrization
##                 StdDev    Corr  
## (Intercept)     0.6318311 (Intr)
## hour_estimate_c 0.4862812 0.271 
## Residual        1.2544055       
## 
## Fixed effects:  Oura_Sleep_hour ~ hour_estimate_c 
##                    Value Std.Error  DF  t-value p-value
## (Intercept)     7.099294 0.1163505 912 61.01644       0
## hour_estimate_c 0.690655 0.0986821 912  6.99878       0
##  Correlation: 
##                 (Intr)
## hour_estimate_c 0.239 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.46757269 -0.56924621  0.02173227  0.56239771  3.43394523 
## 
## Number of Observations: 953
## Number of Groups: 40
get_coef_table <- function(m) {
  summ <- summary(m)
  tab <- summ$tTable
  tab <- tab[2:nrow(tab),]
  
  betas <- tab[,1]
  ses <- tab[,2]
  p_vals <- tab[,5]
  
  data.frame(beta = betas, SE = ses, p = p_vals)
}

# get_coef_table(m2_oura_nlme)

Cluster-centred predictors. Drawback is that the intercept and cluster (participant) means are almost perfectly correlated.

m2_oura_cluster <- lmer(
  Oura_Sleep_hour ~ hour_estimate_cluster_centered + hour_estimate_cluster_mean + (hour_estimate_cluster_centered | subject),
  dat_test_oura)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00243677 (tol = 0.002, component 1)
summary(m2_oura_cluster)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## Oura_Sleep_hour ~ hour_estimate_cluster_centered + hour_estimate_cluster_mean +  
##     (hour_estimate_cluster_centered | subject)
##    Data: dat_test_oura
## 
## REML criterion at convergence: 3256.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4812 -0.5742  0.0333  0.5490  3.3276 
## 
## Random effects:
##  Groups   Name                           Variance Std.Dev. Corr
##  subject  (Intercept)                    0.4354   0.6598       
##           hour_estimate_cluster_centered 0.0600   0.2449   0.27
##  Residual                                1.5719   1.2538       
## Number of obs: 953, groups:  subject, 40
## 
## Fixed effects:
##                                Estimate Std. Error t value
## (Intercept)                     5.18302    0.55199   9.390
## hour_estimate_cluster_centered  0.33962    0.05040   6.739
## hour_estimate_cluster_mean      0.22716    0.06907   3.289
## 
## Correlation of Fixed Effects:
##                 (Intr) hr_stmt_clstr_c
## hr_stmt_clstr_c  0.017                
## hr_stmt_clstr_m -0.978  0.023         
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00243677 (tol = 0.002, component 1)

Evaluation

plot(m2_oura)

qqnorm(resid(m2_oura))
qqline(resid(m2_oura))

hist(resid(m2_oura))

Random effects look okay except for that outlier in the bottom left.

re <- ranef(m2_oura)$subject %>%
  rownames_to_column("subject") %>%
  pivot_longer(!subject)

ggplot(re, aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~ name, scales = "free")

r_squared <- r2mlm(m2_oura)

# noquote class
vars <- r_squared$Decompositions[,1]
vars_df <- data.frame(variance = unclass(vars)) %>%
  rownames_to_column("component") %>%
  filter(component != "fixed, between") %>%
  mutate(
    component = case_when(
      component == "fixed, within" | component == "fixed" ~ "Fixed effects",
      component == "slope variation" ~ "Random slope",
      component == "mean variation" ~ "Random intercept",
      component == "sigma2" ~ "Residuals"
    )
  )

vars_df
cols <- rev(c("skyblue2", "skyblue3", "skyblue4", "grey20"))

g_var <- ggplot(vars_df, aes(1, variance)) +
  geom_col(aes(fill = fct_rev(component))) +
  scale_fill_manual(values = cols, name = "Variance component") +
  labs(x = NULL, y = "Proportion of variance") +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(position = "right") +
  theme_minimal() +
  theme(
    text = element_text(size = 20),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  )

g_var

r2mlm(m2_oura_cluster)

## $Decompositions
##                      total     within   between
## fixed, within   0.10669855 0.13600387        NA
## fixed, between  0.04310256         NA 0.2000358
## slope variation 0.05550070 0.07074427        NA
## mean variation  0.17237164         NA 0.7999642
## sigma2          0.62232655 0.79325186        NA
## 
## $R2s
##          total     within   between
## f1  0.10669855 0.13600387        NA
## f2  0.04310256         NA 0.2000358
## v   0.05550070 0.07074427        NA
## m   0.17237164         NA 0.7999642
## f   0.14980111         NA        NA
## fv  0.20530181 0.20674814        NA
## fvm 0.37767345         NA        NA

Prediction

plot(predict_response(m2_oura, terms = "hour_estimate_c"), show_data = TRUE, jitter = 0.1) +
  labs(title = "Predicted values of sleep duration", 
       x = "Hour estimate (centred)", y = "Sleep duration") +
  theme(text = element_text(size = 16))

hour_mean <- dat_test_oura$hour_estimate_c %@% "scaled:center"
hour_sd <- dat_test_oura$hour_estimate_c %@% "scaled:scale"

pr <- predict_response(m2_oura, terms = "hour_estimate_c") %>%
  as.data.frame() %>%
  mutate(x = hour_sd * x + hour_mean)

n_presses_breaks = c(400, 1100, 3000, 8100, 22000)

max_est = round(max(dat_test_oura$hour_estimate))

g_pred <- ggplot(pr, aes(x, predicted)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
  geom_line(color = "lightcoral", lwd = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), 
              fill="lightcoral", alpha = 0.2) +
  geom_jitter(
    aes(hour_estimate, Oura_Sleep_hour, color = n_total_presses), 
    width = 0.2, height = 0.1,# alpha = 0.5,
    data = dat_test_oura) +
  scale_color_gradient(name = "Number of key presses", transform = "log",
                       breaks = n_presses_breaks, labels = n_presses_breaks, 
                       high = "aquamarine3") +
  scale_x_continuous(minor_breaks = seq(0, max_est)) +
  scale_y_continuous(minor_breaks = seq(0, 15), limits = c(0, 15)) + 
  labs(x = "BiAffect-estimated sleep duration (hour)", 
       y = "Oura-estimated sleep duration (hour)") +
  coord_fixed() +
  theme_minimal() +
  theme(
    text = element_text(size = 20),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  )

g_pred

# ggsave(file.path(man_img_dir, "duration_prediction_oura.pdf"))
g_legends <- ggarrange(get_legend(g_pred), get_legend(g_var), ncol = 1, align = "v") 
g_plots <- ggarrange(g_pred, g_var, widths = c(7, 1), legend = "none")

ggarrange(g_plots, g_legends, widths = c(4, 1))

# ggsave(file.path(man_img_dir, "duration_prediction_oura_var.pdf"))

Demographics

ids <- as.integer(unique(dat_test_sr$subject))

dem_test <- dem %>%
  filter(id %in% ids) %>%
  select(!c(id, race_clear123, ethnicity, bmi, filing_status, sex_orient)) %>%
  select(where(~ !is.numeric(.x) || sum(.x, na.rm = TRUE) > 0)) %>%
  mutate(across(starts_with("currentdx"), as.logical))

tab_txt <- capture.output(CreateTableOne(data = dem_test)) %>%
  str_replace("^\\s\\s", "") %>%
  str_replace("(\\w|\\))\\s+(\\d.*)", "\\1;\\2") %>%
  str_replace_all("currentdx_(.+) = TRUE \\(%\\);(.*)", "   \\1;\\2")

cat(tab_txt, sep = "\n")
##                                 
##                                  Overall      
## n;70        
## Age (mean (SD));26.75 (4.62) 
## Gender (%)                                    
##    Female;65 (92.9) 
##    Nonbinary;4 ( 5.7) 
##    Other;1 ( 1.4) 
## House income (%)                              
##    less than $15,000;5 ( 7.9) 
##    $15,000 - $19,999;0 ( 0.0) 
##    $20,000 - $24,999;3 ( 4.8) 
##    $25,000 - $29,999;5 ( 7.9) 
##    $30,000 - $34,999;2 ( 3.2) 
##    $35,000 - $39,999;4 ( 6.3) 
##    $40,000 - $49,999;5 ( 7.9) 
##    $50,000 - $79,999;16 (25.4) 
##    $80,000 - $99,999;9 (14.3) 
##    $100,000 or above;14 (22.2) 
##    bipolar2;3 ( 4.5) 
##    MDD;38 (56.7) 
##    PDD;27 (40.3) 
##    othDD;1 ( 1.5) 
##    SUDalc;9 (13.4) 
##    SUDsha;1 ( 1.5) 
##    SUDmj;6 ( 9.0) 
##    SUDother;1 ( 1.5) 
##    PanicDO;4 ( 6.0) 
##    Agoraphobia;5 ( 7.5) 
##    SAD;24 (35.8) 
##    Phobi;7 (10.4) 
##    GAD;30 (44.8) 
##    OtherAnx;1 ( 1.5) 
##    OCD;5 ( 7.5) 
##    otherOC;1 ( 1.5) 
##    BED;3 ( 4.5) 
##    otheat;3 ( 4.5) 
##    ADHD;14 (20.9) 
##    PTSD;12 (17.9) 
##    Othertrauma;2 ( 3.0) 
##    PMDD_prov;10 (15.9) 
##    BPD;4 ( 6.0)
ids <- as.integer(unique(dat_test_oura$subject))

dem_test <- dem %>%
  filter(id %in% ids) %>%
  select(!c(id, race_clear123, ethnicity, bmi, filing_status, sex_orient)) %>%
  select(where(~ !is.numeric(.x) || sum(.x, na.rm = TRUE) > 0)) %>%
  mutate(across(starts_with("currentdx"), as.logical))

tab_txt <- capture.output(CreateTableOne(data = dem_test)) %>%
  str_replace("^\\s\\s", "") %>%
  str_replace("(\\w|\\))\\s+(\\d.*)", "\\1;\\2") %>%
  str_replace_all("currentdx_(.+) = TRUE \\(%\\);(.*)", "   \\1;\\2")

cat(tab_txt, sep = "\n")
##                                 
##                                  Overall      
## n;40        
## Age (mean (SD));26.77 (4.15) 
## Gender (%)                                    
##    Female;36 (90.0) 
##    Nonbinary;3 ( 7.5) 
##    Other;1 ( 2.5) 
## House income (%)                              
##    less than $15,000;4 (11.4) 
##    $15,000 - $19,999;0 ( 0.0) 
##    $20,000 - $24,999;3 ( 8.6) 
##    $25,000 - $29,999;3 ( 8.6) 
##    $30,000 - $34,999;2 ( 5.7) 
##    $35,000 - $39,999;3 ( 8.6) 
##    $40,000 - $49,999;1 ( 2.9) 
##    $50,000 - $79,999;9 (25.7) 
##    $80,000 - $99,999;5 (14.3) 
##    $100,000 or above;5 (14.3) 
##    bipolar2;2 ( 5.3) 
##    MDD;23 (60.5) 
##    PDD;17 (44.7) 
##    SUDalc;4 (10.5) 
##    SUDmj;3 ( 7.9) 
##    PanicDO;2 ( 5.3) 
##    Agoraphobia;2 ( 5.3) 
##    SAD;10 (26.3) 
##    Phobi;4 (10.5) 
##    GAD;17 (44.7) 
##    OtherAnx;1 ( 2.6) 
##    OCD;3 ( 7.9) 
##    otherOC;1 ( 2.6) 
##    BED;3 ( 7.9) 
##    otheat;1 ( 2.6) 
##    ADHD;9 (23.7) 
##    PTSD;6 (15.8) 
##    PMDD_prov;3 ( 8.6) 
##    BPD;1 ( 2.6)

Druijff-Van de Woestijne

dat_dvdw <- read_parquet(file.path(dat_dir, "dat_dvdw.parquet"))

colnames(dat_dvdw)
## [1] "subject"             "date"                "kap"                
## [4] "n_active_hours"      "prev_n_active_hours" "next_n_active_hours"
dat_dvdw_oura <- dat_dvdw %>%
  left_join(dat_oura %>% mutate(id = as.character(id)), 
            by = c("subject" = "id", "date" = "daterated")) %>%
  drop_na(Oura_Sleep_TST) %>%
  mutate(
    kap_c = drop(scale(kap)),
    n_active_hours_c = drop(scale(n_active_hours)),
    prev_n_active_hours_c = drop(scale(prev_n_active_hours))
  )

colnames(dat_dvdw_oura)
##  [1] "subject"               "date"                  "kap"                  
##  [4] "n_active_hours"        "prev_n_active_hours"   "next_n_active_hours"  
##  [7] "cleartrialphase"       "sleepdur_yest"         "SleepLNQuality"       
## [10] "Oura_Sleep_TST"        "sedative_YN"           "stim_YN"              
## [13] "SSRI"                  "SNRI"                  "bzd_YN"               
## [16] "Oura_Sleep_hour"       "kap_c"                 "n_active_hours_c"     
## [19] "prev_n_active_hours_c"
m_dvdw <- lmer(Oura_Sleep_hour ~ kap_c + n_active_hours_c + prev_n_active_hours_c + (1 | subject),
               dat_dvdw_oura)

summary(m_dvdw)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Oura_Sleep_hour ~ kap_c + n_active_hours_c + prev_n_active_hours_c +  
##     (1 | subject)
##    Data: dat_dvdw_oura
## 
## REML criterion at convergence: 3583.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8836 -0.5606  0.0220  0.5891  3.6701 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  subject  (Intercept) 0.5297   0.7278  
##  Residual             1.5259   1.2353  
## Number of obs: 1069, groups:  subject, 41
## 
## Fixed effects:
##                       Estimate Std. Error t value
## (Intercept)            6.96366    0.12332  56.468
## kap_c                  0.62161    0.04385  14.176
## n_active_hours_c      -0.10696    0.04368  -2.449
## prev_n_active_hours_c -0.03105    0.03855  -0.806
## 
## Correlation of Fixed Effects:
##             (Intr) kap_c  n_ct__
## kap_c       -0.001              
## n_ctv_hrs_c  0.000  0.471       
## prv_n_ctv__  0.000 -0.025 -0.028
m_dvdw_nlme <- lme(Oura_Sleep_hour ~ kap_c + n_active_hours_c + prev_n_active_hours_c, 
                   data = dat_dvdw_oura,
                   random = ~ 1 | subject,
                   na.action = na.omit)

summary(m_dvdw_nlme)
## Linear mixed-effects model fit by REML
##   Data: dat_dvdw_oura 
##        AIC      BIC    logLik
##   3595.784 3625.609 -1791.892
## 
## Random effects:
##  Formula: ~1 | subject
##         (Intercept) Residual
## StdDev:   0.7277995 1.235264
## 
## Fixed effects:  Oura_Sleep_hour ~ kap_c + n_active_hours_c + prev_n_active_hours_c 
##                           Value  Std.Error   DF  t-value p-value
## (Intercept)            6.963658 0.12331942 1025 56.46846  0.0000
## kap_c                  0.621613 0.04384981 1025 14.17596  0.0000
## n_active_hours_c      -0.106959 0.04368049 1025 -2.44866  0.0145
## prev_n_active_hours_c -0.031052 0.03854545 1025 -0.80560  0.4207
##  Correlation: 
##                       (Intr) kap_c  n_ct__
## kap_c                 -0.001              
## n_active_hours_c       0.000  0.471       
## prev_n_active_hours_c  0.000 -0.025 -0.028
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.88362503 -0.56064334  0.02198986  0.58905176  3.67013387 
## 
## Number of Observations: 1069
## Number of Groups: 41
plot(m_dvdw)

qqnorm(resid(m_dvdw))
qqline(resid(m_dvdw))

hist(resid(m_dvdw))

re <- ranef(m_dvdw)$subject %>%
  rownames_to_column("subject") %>%
  pivot_longer(!subject)

ggplot(re, aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~ name, scales = "free")

plot(predict_response(m_dvdw, terms = "kap_c"), show_data = TRUE) +
  xlim(-4, 4)
## Data points may overlap. Use the `jitter` argument to add some amount of
##   random variation to the location of data points and avoid overplotting.

kap_mean <- mean(dat_dvdw_oura$kap)
kap_sd <- sd(dat_dvdw_oura$kap)

pr <- predict_response(m_dvdw, terms = "kap_c") %>%
  as.data.frame() %>%
  mutate(x = kap_sd * x + kap_mean)

n_presses_breaks = c(400, 1100, 3000, 8100, 22000)

g_dvdw <- ggplot(pr, aes(x, predicted)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
  geom_line(color = "lightcoral", lwd = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), fill="lightcoral", alpha = 0.2) +
  geom_point(
    aes(kap, Oura_Sleep_hour),
    data = dat_dvdw_oura) +
  # scale_color_gradient(name = "Number of key presses", transform = "log",
  #                      breaks = n_presses_breaks, labels = n_presses_breaks, high = "aquamarine3") +
  scale_x_continuous(minor_breaks = ~ seq(0, 15)) +
  labs(x = "Keystroke-absence period (hour)", y = "Oura-estimated sleep duration (hour)") +
  theme_minimal() +
  theme(
    text = element_text(size = 20)
  )

# ggsave(file.path(man_img_dir, "duration_prediction_dvdw.pdf"))

g_dvdw

r_squared <- r2mlm(m_dvdw)

# noquote class
vars <- r_squared$Decompositions[,1]
vars_df <- data.frame(variance = unclass(vars)) %>%
  rownames_to_column("component") %>%
  filter(component != "fixed, between", component != "slope variation") %>%
  mutate(
    component = case_when(
      component == "fixed" ~ "Fixed effects",
      component == "mean variation" ~ "Random intercept",
      component == "sigma2" ~ "Residuals"
    )
  )

vars_df
cols <- rev(c("aquamarine2", "aquamarine3", "grey20"))

g_var_dvdw <- ggplot(vars_df, aes(1, variance)) +
  geom_col(aes(fill = fct_rev(component))) +
  scale_fill_manual(values = cols, name = "Variance component") +
  labs(x = NULL, y = "Proportion of variance") +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(position = "right") +
  theme_minimal() +
  theme(
    text = element_text(size = 20),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  )

g_var_dvdw

ggarrange(g_dvdw, g_var_dvdw, widths = c(5, 2))

# ggsave(file.path(man_img_dir, "duration_prediction_oura_var_dvdw.pdf"))